library(data.table)
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.0.5
library(kableExtra)
library(ggplot2); theme_set(theme_bw(base_size = 16))
library(patchwork)

Single-cell samples of human PBMCs that were aligned with CellRanger.

There are 3 donors and 3 treatments for each donor, i.e. 9 samples in total:

  • donors: B029, 5011, 5334
  • treatments:
    • 1: DMS
    • 3: IL15
    • 4: IL15+HODHbt

We are going to read in the .h5 files generated by CellRanger.

Details about h5 format are here

I used Seurat’s function for reading the H5 data (see readingH5.R).

options(menu.graphics=FALSE)
#library(SingleCellExperiment);library(Matrix)

data_dir <- "2022-04_Dennis/data/"
source("readingH5.R") 

smpls <- c("029-1","029-3","029-4","5011-1","5011-3","5011-4","5334-1","5334-3","5334-4")
smptab <- data.frame(filename=smpls,
    sample=c(
        paste("B029",c("DMSO","IL15","IL15HOD"), sep="_"),
        paste("OM5011",c("DMSO","IL15","IL15HOD"), sep="_"),
        paste("OM5334",c("DMSO","IL15","IL15HOD"), sep="_"))
)

scel <- lapply(smpls, function(x){
    mysmp <- subset(smptab, filename == x)$sample
    print(paste("Reading CellRanger output for", x))
    retsce <- Read10X_h5_2SCE(paste0(data_dir, "JonesLab_IL15J_10Xdata/", x, "/filtered_feature_bc_matrix.h5"), mysmp)
    return(retsce)
})
names(scel) <- smptab$sample

full_data <- do.call(cbind, lapply(scel, counts))
cell_info <- do.call(rbind, lapply(scel, colData))
gene_info <- rowData(scel[[1]])

## combine in one object
sce.all <- SingleCellExperiment(
    list(counts = full_data), 
    rowData = gene_info,
    colData = cell_info, 
    metadata = list(Samples = names(scel))
)

rm(scel); gc()

## remove completely uncovered genes
gnszero <- Matrix::rowSums(counts(sce.all)) == 0
sce.all <- sce.all[!gnszero, ]
#> dim(sce.all)
# [1] 22374 46086

## add QC info
is.mito <- grepl("mt-", ignore.case = TRUE, rowData(sce.all)$Symbol)
sce.all <- scuttle::addPerCellQC(sce.all, subsets=list(mitochondrial=is.mito))

mm <- lapply(unique(sce.all$Sample), function(x){
        ss <- sce.all[, sce.all$Sample == x]$subsets_mitochondrial_percent
        scuttle::isOutlier(ss, type = "higher")})
sce.all$mito.discard <- unlist(mm)
sce.all$mito.discard <- ifelse(is.na(sce.all$mito.discard), FALSE, sce.all$mito.discard)

## determine GENE count thresholds for each Sample individually
gg <- lapply(unique(sce.all$Sample), function(x){
    ss <- log10(sce.all[, sce.all$Sample == x]$detected)
    scuttle::isOutlier(ss)
})
sce.all$gene.discard <- unlist(gg)

## save colData
library(data.table);library(magrittr)
cd <-colData(sce.all) 
cd$cell <- rownames(cd)
cd <- as.data.frame(cd) %>% as.data.table
saveRDS(cd, file = paste0("colData_unfiltered_", Sys.Date(), ".rds"))

saveRDS(sce.all, file = paste0("sce_unfiltered_", Sys.Date(), ".rds"))

Finding filters

I typically do iterative rounds of filtering, starting with scuttle’s outlier detection and double-checking the distributions and comparing them across samples. This is necessary because different sample types will have different types of QC characteristics, e.g. blood cells are usually fairly small and contain little RNA, which is very different when dealing, for example, with hepatocytes or other metabolically heavily active cells.

## load the colData locally for Rmd
qcall <- readRDS(paste0(data_dir, "colData_unfiltered_2022-05-03.rds"))
qcall$condition <- gsub("_.*","", qcall$Sample)
qcall$replicate <-gsub("M", "", gsub(".*_","",qcall$Sample))
for(i in unique(qcall$condition)){
    dt <- qcall[condition == i]
    P1 <- ggplot(dt,
        aes(x = Sample, y = subsets_mitochondrial_percent, color = mito.discard)) +
        ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
        theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=14)) +
        scale_color_manual(values = rev(c("firebrick2","grey75"))) +
        ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,20)) +
        theme(legend.position = "bottom")+
        guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
    P2 <- ggplot(dt,
        aes(x = subsets_mitochondrial_percent, y = log10(detected),
            color = mito.discard)) +
        geom_point(size = .2, shape = 1 , alpha = .5) +
        facet_wrap(~Sample, ncol = 2) +
        scale_color_manual(values = rev(c("firebrick2","grey75"))) +
        xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+ 
        theme(legend.position = "bottom")+
        guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
    pw <- P1 | P2
    pw <- pw + plot_annotation(title = i) + plot_layout(widths = c(1.5,3))
    print(pw)
}

Let’s check which cells are indicated to be removed based on the gene content. Here, we show only those cells that survive the mito-filter above:

for(i in unique(qcall$condition)){
    dt <- qcall[condition == i & mito.discard != TRUE]
    ymax <- max(dt$subsets_mitochondrial_percent)
    P1 <- ggplot(dt,
        aes(x = Sample, y = subsets_mitochondrial_percent, color = gene.discard)) +
        ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
        theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=14)) +
        scale_color_manual(values = rev(c("blue","grey75"))) +
        ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,ymax)) +
        theme(legend.position = "bottom") + 
        guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
    P2 <- ggplot(dt,
        aes(x = subsets_mitochondrial_percent, y = log10(detected),
            color = gene.discard)) +
        geom_point(size = .2, shape = 1 , alpha = .5) +
        facet_wrap(~Sample, ncol = 2) +
        scale_color_manual(values = rev(c("blue","grey75"))) +
        xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
        theme(legend.position = "bottom") +
        guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
    pw <- P1 | P2
    pw <- pw + plot_annotation(title = i, subtitle = "Cells that will be discarded due to the number of genes") + plot_layout(widths = c(1.5,3))
    print(pw)
}

I think, I will disregard scuttle’s gene.discard filter, and just remove those with very few genes:

  • B029: <2.6
  • OM511: <2.75
  • OM5334: <2.75 (except OM5334-Il15HOD: <2.5

First filtering

Will go with scuttle’s outlier detection for the mito content and with the above mentioned settings for the gene content.

# with scuttle's outlier detection
 table(qcall$Sample, qcall$gene.discard)
                
                 FALSE  TRUE
  B029_DMSO       4449   374
  B029_IL15       3394   145
  B029_IL15HOD    4992   177
  OM5011_DMSO     3191   389
  OM5011_IL15     4358   236
  OM5011_IL15HOD  2760   202
  OM5334_DMSO     2850   335
  OM5334_IL15     5490   392
  OM5334_IL15HOD 12083   269
qcall[, min_genes := ifelse(Sample %in% c("OM5011_DMSO", "OM5011_IL15","OM5011_IL15HOD", "OM5334_DMSO","OM5334_IL15"), 2.75,
    ifelse(grepl("^B029", qcall$Sample), 2.6, 2.5))]
qcall[ , gene.discard := ifelse(log10(detected) <= min_genes, TRUE, FALSE)]
qcall[, rm_cell := ifelse(mito.discard == TRUE | gene.discard == TRUE, TRUE, FALSE)]
> table(qcall$Sample, qcall$gene.discard)
                
                 FALSE  TRUE
  B029_DMSO       4752    71
  B029_IL15       3502    37
  B029_IL15HOD    5130    39
  OM5011_DMSO     3391   189
  OM5011_IL15     4404   190
  OM5011_IL15HOD  2791   171
  OM5334_DMSO     3060   125
  OM5334_IL15     5586   296
  OM5334_IL15HOD 12320    32

How many cells are going to be removed per sample?

table(qcall$rm_cell, qcall$Sample)
##        
##         B029_DMSO B029_IL15 B029_IL15HOD OM5011_DMSO OM5011_IL15 OM5011_IL15HOD
##   FALSE      4601      3403         5000        3359        4343           2725
##   TRUE        222       136          169         221         251            237
##        
##         OM5334_DMSO OM5334_IL15 OM5334_IL15HOD
##   FALSE        2966        5428          12074
##   TRUE          219         454            278
ggplot(qcall, aes(x = Sample, fill = rm_cell)) +geom_bar() + coord_flip()  +
  #  scale_color_manual(values = c("midnightblue","dodgerblue3","dodgerblue1")) +
    scale_fill_manual(values = c("grey75","orange")) +
    ylab("# cells") + ggtitle("How many cells will be removed per sample?") 

for(i in unique(qcall$condition)){
    dt <- qcall[condition == i & mito.discard!=TRUE]
    ymax <- max(dt$subsets_mitochondrial_percent)
    P1 <- ggplot(dt,
        aes(x = Sample, y = subsets_mitochondrial_percent, color = rm_cell)) +
        ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
        theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=14)) +
        scale_color_manual(values = rev(c("orange","grey75"))) +
        ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,ymax)) +
        theme(legend.position = "bottom") +
        guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
    P2 <- ggplot(dt,
        aes(x = subsets_mitochondrial_percent, y = log10(detected),
            color = rm_cell)) +
        geom_point(size = .2, shape = 1 , alpha = .5) +
        facet_wrap(~Sample, ncol = 2) +
        scale_color_manual(values = rev(c("orange","grey75"))) +
        xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
        theme(legend.position = "bottom") +
        guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
    pw <- P1 | P2
    pw <- pw + plot_annotation(title = i) + plot_layout(widths = c(1.5,3))
    print(pw)
}

Filtering II

# save the colData to be added back to the SCE
saveRDS(qcall, file = paste0(data_dir, "colData_filtered_2022-05-03.rds"))
#$ scp colData_filtered_2022-05-03.rds frd2007@redteam2.pbtech:/scratchLocal/frd2007/2022-04_Dennis/2022-04-26_ReadingDataIn/

## on server
#sce.all = readRDS("sce_unfiltered_2022-05-03.rds")
cd <- colData(sce.all)
cd$cell = row.names(cd)
cd <- as.data.frame(cd) %>% as.data.table

qcall <- readRDS("colData_filtered_2022-05-03.rds")
qcall <- qcall[, c("Sample","Barcodes","cell","rm_cell", "min_genes")] %>% .[cd, on = c("Sample", "Barcodes","cell")]

qcall.df <- DataFrame(as.data.frame(qcall))
rownames(qcall.df) <- qcall.df$cell
colData(sce.all) <- qcall.df[colnames(sce.all),]

## remove cells
sce.all <- sce.all[, !sce.all$rm_cell]
gnszero <- Matrix::rowSums(counts(sce.all)) == 0
sce.all <- sce.all[!gnszero, ]
#> dim(sce.all)
#[1] 22266 43899
saveRDS(sce.all, file = "sce_filtered_2022-05-03.rds")

Cell cycle

library(Matrix);library(scran);library(BiocParallel)
#library(SingleCellExperiment);

#sce = readRDS("sce_filtered_2022-03-25.rds")

hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))

cc.noNormAllCells <- list()

samples = unique(sce.all$Sample)

for(i in samples){
  set.seed(123)
  message(i)
  p = bpstart(MulticoreParam(12))
  sc.tmp <- sce.all[, sce.all$Sample == i]
  cc.noNorm <- scran::cyclone(sc.tmp, pairs=hs.pairs, BPPARAM=p)
  names(cc.noNorm$phases) <- colnames(sc.tmp)
  row.names(cc.noNorm$scores) <- colnames(sc.tmp)
  row.names(cc.noNorm$normalized.scores) <- colnames(sc.tmp)
  cc.noNormAllCells[[i]] <- cc.noNorm
  rm(sc.tmp)
  rm(cc.noNorm)
  gc()
}

phs <- lapply(cc.noNormAllCells, function(x) x$phases) %>% unlist
names(phs) <- sub(".*?\\.","", names(phs))
colData(sce.all)$cc_phase <- phs[colnames(sce.all)]

g1phs <- lapply(cc.noNormAllCells, function(x) x$score$G1) %>% unlist
#names(g1phs) <- sub(".*?\\.","", names(g1phs))
colData(sce.all)$G1score <- g1phs

g2mphs <- lapply(cc.noNormAllCells, function(x) x$score$G2M) %>% unlist
colData(sce.all)$G2Mscore <- g2mphs

##!saveRDS(cc.noNormAllCells, file="cc.noNormAllCells.rds")
##!saveRDS(sce.all, file="sce_filtered_2022-05-03.rds")

Integration

#rownames(sce.all) <- scater::uniquifyFeatureNames(rowData(sce.all)$ID, #rowData(sce.all)$Symbol)

source("/scratchLocal/frd2007/2022-04_Dennis/src/Mnn.R")
library(SingleCellExperiment); library(scater);library(scran);library(batchelor);library(magrittr)

## load previously filtered sample
scf <- readRDS("sce_filtered_2022-05-03.rds")
#scf <- sce.all
#rm(sce.all);gc()

## getting a list ----------------------
scel <- lapply(unique(scf$Sample), function(x){
    individualize_samples(scf[, scf$Sample == x], pf = "allInteg",
        shared_colData = names(colData(scf)))})
names(scel) <- unique(scf$Sample)

## MNN-correction, UMAPing, clustering --> see Mnn.R for details----------------
## First, do integration with all genes, including ery-genes:
scf.mnn <- MNN_correct(scel, hvg_n = 2500, 
    shared_colData = c("Sample", "Barcodes","cell"),
    fn="Dennis_sampleIntegration_allGenes_2022-05-03", 
    cluster_ks = c(50,100,150,200,250,300),
    save_hvg = TRUE, ignore_genes=NULL)

rownames(scf.mnn) <- scater::uniquifyFeatureNames(rowData(scf.mnn)$ID, rowData(scf.mnn)$Symbol)
saveRDS(scf.mnn, file = "sce_Dennis_sampleIntegration_2022-05-03.rds")

scf.mnn.nc <- scf.mnn
counts(scf.mnn.nc) <- NULL
saveRDS(scf.mnn.nc, file = "sce_Dennis_sampleIntegration_2022-05-03_noCounts.rds")

SingleR

library(celldex)
library(SingleR)
scf.mnn <-readRDS(file = "sce_Dennis_sampleIntegration_2022-05-03.rds")
rownames(scf.mnn) <- rowData(scf.mnn)$ID
hpca <- celldex::HumanPrimaryCellAtlasData(ensembl=TRUE)

## Performing predictions
predictedLabels <- SingleR(
    test=scf.mnn, assay.type.test="logcounts", 
    ref=hpca, 
    labels=hpca$label.main)
##!save(predictedLabels, file = "SingleR_HPCALabels.rda")

scf.mnn$labPredict_HPCA <- predictedLabels$pruned.labels
scf.mnn$labPredict_HPCA_prePrune <- predictedLabels$labels

rownames(scf.mnn) <- scater::uniquifyFeatureNames(ID=rowData(scf.mnn)$ID, names = rowData(scf.mnn)$Symbol)
saveRDS(scf.mnn, file = "sce_Dennis_sampleIntegration_2022-05-03.rds")

scf.mnn.nc <- scf.mnn
counts(scf.mnn.nc) <- NULL
saveRDS(scf.mnn.nc, file = "sce_Dennis_sampleIntegration_2022-05-03_noCounts.rds")
## labels assigned to our data
> table(scf.mnn$labPredict_HPCA)

          B_cell              CMP               DC              GMP 
            1658               49              598               10 
      HSC_-G-CSF       Macrophage              MEP         Monocyte 
               1              273               10             2879 
     Neutrophils          NK_cell Pre-B_cell_CD34- Pro-B_cell_CD34+ 
             222             7935               36                4 
         T_cells 
           30091 

## labels present in the reference data set
> table(hpca$label.main)

           Astrocyte               B_cell                   BM 
                   2                   26                    7 
          BM & Prog.         Chondrocytes                  CMP 
                   1                    8                    2 
                  DC Embryonic_stem_cells    Endothelial_cells 
                  88                   17                   64 
    Epithelial_cells         Erythroblast          Fibroblasts 
                  16                    8                   10 
         Gametocytes                  GMP          Hepatocytes 
                   5                    2                    3 
          HSC_-G-CSF            HSC_CD34+            iPS_cells 
                  10                    6                   42 
       Keratinocytes           Macrophage                  MEP 
                  25                   90                    2 
            Monocyte                  MSC            Myelocyte 
                  60                    9                    2 
Neuroepithelial_cell              Neurons          Neutrophils 
                   1                   16                   21 
             NK_cell          Osteoblasts            Platelets 
                   5                   15                    5 
    Pre-B_cell_CD34-     Pro-B_cell_CD34+        Pro-Myelocyte 
                   2                    2                    2 
 Smooth_muscle_cells              T_cells    Tissue_stem_cells 
                  16                   68                   55 

SessionInfo

R version 4.1.2 (2021-11-01)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server release 6.3 (Santiago)

Matrix products: default
BLAS/LAPACK: /pbtech_mounts/homes022/frd2007/miniconda3/envs/r4_env/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] SingleR_1.8.1               celldex_1.4.0              
 [3] scater_1.22.0               ggplot2_3.3.5              
 [5] batchelor_1.10.0            hdf5r_1.3.5                
 [7] magrittr_2.0.3              data.table_1.14.2          
 [9] BiocParallel_1.28.3         scran_1.22.1               
[11] scuttle_1.4.0               SingleCellExperiment_1.16.0
[13] SummarizedExperiment_1.24.0 Biobase_2.54.0             
[15] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
[17] IRanges_2.28.0              S4Vectors_0.32.4           
[19] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
[21] matrixStats_0.61.0          Matrix_1.4-1               

loaded via a namespace (and not attached):
 [1] ggbeeswarm_0.6.0              colorspace_2.0-3             
 [3] ellipsis_0.3.2                bluster_1.4.0                
 [5] XVector_0.34.0                BiocNeighbors_1.12.0         
 [7] ggrepel_0.9.1                 bit64_4.0.5                  
 [9] interactiveDisplayBase_1.32.0 AnnotationDbi_1.56.2         
[11] fansi_1.0.3                   sparseMatrixStats_1.6.0      
[13] cachem_1.0.6                  ResidualMatrix_1.4.0         
[15] cluster_2.1.3                 dbplyr_2.1.1                 
[17] png_0.1-7                     shiny_1.7.1                  
[19] BiocManager_1.30.16           compiler_4.1.2               
[21] httr_1.4.2                    dqrng_0.3.0                  
[23] assertthat_0.2.1              fastmap_1.1.0                
[25] limma_3.50.1                  cli_3.2.0                    
[27] later_1.3.0                   BiocSingular_1.10.0          
[29] htmltools_0.5.2               tools_4.1.2                  
[31] rsvd_1.0.5                    igraph_1.3.0                 
[33] gtable_0.3.0                  glue_1.6.2                   
[35] GenomeInfoDbData_1.2.7        dplyr_1.0.8                  
[37] rappdirs_0.3.3                Rcpp_1.0.8.3                 
[39] vctrs_0.4.0                   Biostrings_2.62.0            
[41] ExperimentHub_2.2.1           DelayedMatrixStats_1.16.0    
[43] beachmat_2.10.0               mime_0.12                    
[45] lifecycle_1.0.1               irlba_2.3.5                  
[47] statmod_1.4.36                AnnotationHub_3.2.2          
[49] edgeR_3.36.0                  zlibbioc_1.40.0              
[51] scales_1.1.1                  promises_1.2.0.1             
[53] parallel_4.1.2                yaml_2.3.5                   
[55] curl_4.3.2                    memoise_2.0.1                
[57] gridExtra_2.3                 RSQLite_2.2.12               
[59] BiocVersion_3.14.0            ScaledMatrix_1.2.0           
[61] filelock_1.0.2                rlang_1.0.2                  
[63] pkgconfig_2.0.3               bitops_1.0-7                 
[65] lattice_0.20-45               purrr_0.3.4                  
[67] bit_4.0.4                     tidyselect_1.1.2             
[69] R6_2.5.1                      generics_0.1.2               
[71] metapod_1.2.0                 DelayedArray_0.20.0          
[73] DBI_1.1.2                     pillar_1.7.0                 
[75] withr_2.5.0                   KEGGREST_1.34.0              
[77] RCurl_1.98-1.6                tibble_3.1.6                 
[79] crayon_1.5.1                  utf8_1.2.2                   
[81] BiocFileCache_2.2.1           viridis_0.6.2                
[83] locfit_1.5-9.5                grid_4.1.2                   
[85] blob_1.2.2                    digest_0.6.29                
[87] xtable_1.8-4                  httpuv_1.6.5                 
[89] munsell_0.5.0                 beeswarm_0.4.0               
[91] viridisLite_0.4.0             vipor_0.4.5